#
# Print numbers in fixed notation: a large scipen penalty makes R avoid
# scientific notation when formatting values.
options(scipen = 999)

—- Load packages —-

library(stringr)
library(corrplot)
## corrplot 0.84 loaded
library(shiny)

—- Load neighborhood data —-

# Restore the R objects saved in each .rda file into the current environment.
# NOTE(review): presumably these files define `county_factors` and
# `county_500CitiesData`, which are merged with the COVID stats later in this
# script — confirm against the .rda contents.
load("Data/county_factors.rda")
load("Data/county_500CitiesData.rda")

—- Load and format covid data —-

data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_time_series/"

# Read in the data. Each row is one location; after the leading descriptive
# columns there is one column per date.
US.deaths <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"),
  header = TRUE, stringsAsFactors = FALSE)
US.cases <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"),
  header = TRUE, stringsAsFactors = FALSE)

# Read in the header separately. With header = TRUE, read.csv() rewrites the
# date labels (e.g. "1/22/20") into syntactic column names, so the raw first
# line is kept here and used to rename the columns afterwards.
# nrows = 1 stops after that first line instead of parsing each file twice.
US.cases.head <- read.csv(
  paste0(data.path, "time_series_covid19_confirmed_US.csv"),
  header = FALSE, stringsAsFactors = FALSE, nrows = 1)[1,]
US.deaths.head <- read.csv(
  paste0(data.path, "time_series_covid19_deaths_US.csv"),
  header = FALSE, stringsAsFactors = FALSE, nrows = 1)[1,]

# Rewrite "m/d/yy"-style date labels as zero-padded, underscore-separated
# strings (e.g. "1/22/20" -> "01_22_20") so they are easier to use as column
# names.
#
# dates: the date labels; called below with a one-row slice of the CSV header.
# Returns a character vector with one reformatted label per input element.
proper_date <- function(dates){
  # Break every label into its "/"-separated components.
  pieces <- sapply(dates, strsplit, split = "/")

  # Left-pad each component to two characters, then glue them back together.
  reformatted <- lapply(pieces, function(piece){
    padded <- str_pad(piece, width = 2, side = "left", pad = "0")
    paste(padded, collapse = "_")
  })

  unlist(reformatted)
}

# Reformat the date labels from the raw header rows. The cases file has 11
# leading non-date columns and the deaths file has 12 (the deaths file also
# carries Population, which is copied into the cases table below).
dates.cases <- proper_date(US.cases.head[-c(1:11)])
dates.deaths <- proper_date(US.deaths.head[-c(1:12)])

# Restore the original labels for the leading columns and use the reformatted
# dates for the rest.
names(US.cases) <- c(US.cases.head[1,1:11], dates.cases)
names(US.deaths) <- c(US.deaths.head[1,1:12], dates.deaths)

# The two tables are combined by row position below, so their rows must line
# up. Compare the row counts first (a length mismatch would otherwise be
# recycled silently) and treat an NA comparison as a mismatch rather than
# letting `if` fail on it.
if(nrow(US.cases) != nrow(US.deaths) ||
   !isTRUE(all(US.cases$UID == US.deaths$UID))){
  warning("COVID data rows do not match!")
}

# Copy Population across and move it to column 12 so that both tables share
# the same layout: 12 leading columns followed by the date columns.
US.cases$Population <- US.deaths$Population
US.cases <- US.cases[,c(1:11, ncol(US.cases), 12:(ncol(US.cases)-1))]

Other stats within the daily reports

data.path <- "Data/COVID-19/csse_covid_19_data/csse_covid_19_daily_reports_us/"

# Keep only the daily report files, which are named MM-DD-YYYY.csv (this also
# skips README.md).
daily_filenames <- list.files(
  data.path, pattern = "^[0-9]{2}-[0-9]{2}-[0-9]{4}\\.csv$")

# list.files() sorts alphabetically, which is not chronological for
# month-first names once the reports span more than one year (01-01-2021 would
# sort before 04-12-2020). Order by the parsed date so that "today's report"
# and the day-to-day differences computed later use the true sequence.
daily_filenames <- daily_filenames[
  order(as.Date(substring(daily_filenames, 1, 10), format = "%m-%d-%Y"))]

# The most recent report.
todays_report_filename <- daily_filenames[length(daily_filenames)]
US.todaysReport <- read.csv(
  paste0(data.path, todays_report_filename),
  header = TRUE, stringsAsFactors = FALSE)

# Province_State values to keep from each daily report.
all.states <- c(
  'Alabama', 'Alaska', 'American Samoa', 'Arizona', 'Arkansas', 'California',
  'Colorado', 'Connecticut', 'Delaware', 'Diamond Princess',
  'District of Columbia', 'Florida', 'Georgia', 'Grand Princess', 'Guam',
  'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
  'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota',
  'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire',
  'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
  'Northern Mariana Islands', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania',
  'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee',
  'Texas', 'Utah', 'Vermont', 'Virgin Islands', 'Virginia', 'Washington',
  'West Virginia', 'Wisconsin', 'Wyoming')
all.states.df <- data.frame(Province_State = all.states)

# Names of the statistic columns in the daily reports.
all.stats <- c("Confirmed", "Deaths", "Recovered", "Active", "Incident_Rate",
               "People_Tested", "People_Hospitalized", "Mortality_Rate",
               "Testing_Rate", "Hospitalization_Rate")

# One data frame per day, named by the report date (MM-DD-YYYY). The merge
# keeps only rows whose key matches an entry of all.states.df.
compiled.stats <- vector("list", length(daily_filenames))
names(compiled.stats) <- substring(daily_filenames, 1, 10)
for(i in seq_along(daily_filenames)){
  report <- read.csv(
    paste0(data.path, daily_filenames[i]),
    header = TRUE, stringsAsFactors = FALSE)
  compiled.stats[[i]] <- merge(all.states.df, report, all.y = FALSE)
}

Functions for compiling and visualizing stats in the daily reports.

# Bar-plot one statistic for one state across every compiled daily report.
#
# state: value to match against the Province_State column of each report.
# stat:  name of the report column to plot (e.g. "Confirmed").
#
# Reads the globals `compiled.stats` (one data frame per day) and
# `daily_filenames` (used as bar labels). A day is plotted as missing (NA)
# when the report has no `stat` column or does not contain exactly one row for
# the state; previously such a day turned the collected values into a list and
# barplot() failed.
# Returns the bar midpoints from barplot(), invisibly.
plot.dailyStat <- function(state, stat){
  data <- vapply(seq_along(daily_filenames), function(x){
    report <- compiled.stats[[x]]
    if(!(stat %in% names(report))){
      return(NA_real_)
    }
    value <- report[which(report$Province_State == state), stat]
    if(length(value) == 1) as.numeric(value) else NA_real_
  }, numeric(1))
  names(data) <- daily_filenames
  barplot(data, main = paste0(state, " ", stat), las = 2, cex.axis = 1, cex.names = 0.5)
}

# Bar-plot the day-to-day change in one statistic for one state.
#
# state: value to match against the Province_State column of each report.
# stat:  name of the report column to difference (e.g. "Confirmed").
#
# Reads the globals `compiled.stats` and `daily_filenames`. A day is treated
# as missing (NA) when the report has no `stat` column or does not contain
# exactly one row for the state; the changes into and out of that day are then
# NA as well. Stops if fewer than two daily reports are available.
# Returns the bar midpoints from barplot(), invisibly.
plot.dailyStatRise <- function(state, stat){
  data <- vapply(seq_along(daily_filenames), function(x){
    report <- compiled.stats[[x]]
    if(!(stat %in% names(report))){
      return(NA_real_)
    }
    value <- report[which(report$Province_State == state), stat]
    if(length(value) == 1) as.numeric(value) else NA_real_
  }, numeric(1))
  names(data) <- daily_filenames

  if(length(data) < 2){
    stop("at least two daily reports are needed to compute a rise",
         call. = FALSE)
  }

  # diff() gives each day's value minus the previous day's. Keep the result as
  # a one-row matrix labelled with the later day of each pair, as before.
  rise.stat <- matrix(diff(data), nrow = 1,
                      dimnames = list(NULL, names(data)[-1]))

  barplot(rise.stat, main = paste0(state, " rise in ",stat), las = 2, cex.axis = 1, cex.names = 0.5)
}

Interactive Plots

Province_State - The name of the state within the USA. Country_Region - The name of the country (US). Last_Update - The most recent date the file was pushed. Lat - Latitude. Long_ - Longitude. Confirmed - Aggregated confirmed case count for the state. Deaths - Aggregated death count for the state. Recovered - Aggregated recovered case count for the state. Active - Aggregated confirmed cases that have not been resolved (Active = Confirmed - Recovered - Deaths). FIPS - Federal Information Processing Standards code that uniquely identifies counties within the USA. Incident_Rate - Confirmed cases per 100,000 persons. People_Tested - Total number of people who have been tested. People_Hospitalized - Total number of people hospitalized. Mortality_Rate - Number of recorded deaths * 100 / Number of confirmed cases. UID - Unique identifier for each row entry. ISO3 - Officially assigned country code identifiers. Testing_Rate - Total number of people tested per 100,000 persons. Hospitalization_Rate - Total number of people hospitalized * 100 / Number of confirmed cases.

Split the dataset into the data and the info for usability.

# Each table has 12 leading descriptive columns (the first being the row
# identifier) followed by one column per date. Split every table into an
# "info" matrix (descriptive columns) and a "data" matrix (date columns), both
# using the first column's values as row names and then dropping that column.

# Cases: descriptive columns.
US.cases.info <- as.matrix(US.cases[, 1:12])
rownames(US.cases.info) <- US.cases.info[, 1]
US.cases.info <- US.cases.info[, -1]

# Cases: daily counts.
US.cases.data <- as.matrix(US.cases[, -(2:12)])
rownames(US.cases.data) <- US.cases.data[, 1]
US.cases.data <- US.cases.data[, -1]

# Deaths: descriptive columns.
US.deaths.info <- as.matrix(US.deaths[, 1:12])
rownames(US.deaths.info) <- US.deaths.info[, 1]
US.deaths.info <- US.deaths.info[, -1]

# Deaths: daily counts.
US.deaths.data <- as.matrix(US.deaths[, -(2:12)])
rownames(US.deaths.data) <- US.deaths.data[, 1]
US.deaths.data <- US.deaths.data[, -1]


# Number of date columns in each table, and number of rows (locations).
ndays.cases <- ncol(US.cases.data)
ndays.deaths <- ncol(US.deaths.data)

nobs <- nrow(US.cases.data)

—- The Curve —-

# Bar-plot the cumulative COVID-19 count for one state, by date, starting at
# the first date with a positive count.
#
# state:    value to match against the Province_State column.
# stat:     "cases" or "deaths"; defaults to "cases" when omitted.
# logScale: if TRUE, draw the y axis on a log scale.
#
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data.
# Stops with an error when the state has no positive counts (which includes a
# state name that matches no rows).
# Returns the bar midpoints from barplot(), invisibly.
state.curve <- function(state, stat = c("cases", "deaths"), logScale = TRUE){
  # Resolve the default: the unevaluated default is a length-2 vector, which
  # cannot be used directly in `if`.
  stat <- match.arg(stat)

  # drop = FALSE keeps a one-row selection as a matrix so colSums() works for
  # states that have a single row.
  if(stat == "cases"){
    data <- US.cases.data[which(US.cases$Province_State == state), , drop = FALSE]
  }else{
    data <- US.deaths.data[which(US.deaths$Province_State == state), , drop = FALSE]
  }

  # State total per date.
  data.sum <- colSums(data)
  positive.days <- which(data.sum > 0)
  if(length(positive.days) == 0){
    stop("no ", stat, " recorded for state '", state, "'", call. = FALSE)
  }
  day.first.case <- min(positive.days)
  n.days <- length(data.sum)

  if(logScale){
    barplot(data.sum[day.first.case:n.days],
            main = paste0("Total COVID-19 ", stat," by date in ", state, ", log scale"),
            log = "y", las = 2, cex.axis = 1, cex.names = 0.5)
  }else{
    barplot(data.sum[day.first.case:n.days],
            main = paste0("Total COVID-19 ", stat," by date in ", state),
            las = 2, cex.axis = 1, cex.names = 0.5)
  }
}
# Bar-plot the day-to-day rise in the cumulative COVID-19 count for one state,
# starting at the first date with a positive rise.
#
# state: value to match against the Province_State column.
# stat:  "cases" or "deaths"; defaults to "cases" when omitted.
#
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data.
# Stops with an error when the state's total never rises (which includes a
# state name that matches no rows).
# Returns the bar midpoints from barplot(), invisibly.
state.rise <- function(state, stat = c("cases", "deaths")){
  # Resolve the default: the unevaluated default is a length-2 vector, which
  # cannot be used directly in `if`.
  stat <- match.arg(stat)

  # drop = FALSE keeps a one-row selection as a matrix so colSums() works for
  # states that have a single row.
  if(stat == "cases"){
    data.thisState <- US.cases.data[which(US.cases$Province_State == state), , drop = FALSE]
  }else{
    data.thisState <- US.deaths.data[which(US.deaths$Province_State == state), , drop = FALSE]
  }

  # State total per date, then each date's total minus the previous date's.
  # The differences keep the label of the later date of each pair.
  data.sum <- colSums(data.thisState)
  rise.cases <- diff(data.sum)

  rising.days <- which(rise.cases > 0)
  if(length(rising.days) == 0){
    stop("no rise in ", stat, " recorded for state '", state, "'",
         call. = FALSE)
  }
  day.first.case <- min(rising.days)
  n.days <- length(rise.cases)

  barplot(rise.cases[day.first.case:n.days], main = paste0("Rise in COVID-19 ", stat, " by Date in ", state), las = 2, cex.axis = 1, cex.names = 0.5)
}
# Bar-plot the cumulative COVID-19 count for one county, by date, on a log
# scale, starting at the first date with a positive count.
#
# county: value to match against the Admin2 column.
# stat:   "cases" or "deaths"; defaults to "cases" when omitted.
# state:  optional Province_State value used to pick one county when the same
#         Admin2 name occurs in several states. When NULL (the default) and
#         several rows match, their counts are summed and a warning is issued.
#
# Reads the globals US.cases / US.cases.data or US.deaths / US.deaths.data.
# Stops with an error when no row matches or the county has no positive
# counts.
# Returns the bar midpoints from barplot(), invisibly.
county.curve <- function(county, stat = c("cases", "deaths"), state = NULL){
  # Resolve the default: the unevaluated default is a length-2 vector, which
  # cannot be used directly in `if`.
  stat <- match.arg(stat)

  if(stat == "cases"){
    info <- US.cases
    counts <- US.cases.data
  }else{
    info <- US.deaths
    counts <- US.deaths.data
  }

  if(is.null(state)){
    rows <- which(info$Admin2 == county)
  }else{
    rows <- which(info$Admin2 == county & info$Province_State == state)
  }
  if(length(rows) == 0){
    stop("no rows found for county '", county, "'", call. = FALSE)
  }
  if(length(rows) > 1){
    warning("county '", county, "' matches ", length(rows),
            " rows; plotting their sum (pass `state` to select one)",
            call. = FALSE)
  }

  # Sum over the matching rows; for a single match this is just that row.
  # drop = FALSE keeps the selection a matrix so colSums() always applies.
  data <- colSums(counts[rows, , drop = FALSE])

  positive.days <- which(data > 0)
  if(length(positive.days) == 0){
    stop("no ", stat, " recorded for county '", county, "'", call. = FALSE)
  }
  day.first.case <- min(positive.days)
  n.days <- length(data)

  barplot(data[day.first.case:n.days], main = paste0("Total COVID-19 ", stat," by date in ", county), log = "y", las = 2, cex.axis = 1, cex.names = 0.5)

}

# Example: draw the Tulsa county curve for cases, then for deaths.
invisible(lapply(c("cases", "deaths"), function(stat){
  county.curve("Tulsa", stat)
}))

—- Calculate some useful stats to compare with neighborhood data —-

# Summary table with one row per row of the cases table, keyed by UID.
# Later sections add one column per statistic.
US.stats <- data.frame(UID = US.cases$UID)

# National cumulative cases per date (sum over all rows).
cases.total <- colSums(US.cases.data)

# Plot from the first date on which the national total exceeded 100.
day.first.case <- min(which(cases.total > 100))
n.days <- length(cases.total)

par(mar = c(5,5,4,2))
barplot(
  cases.total[day.first.case:n.days],
  main = "Total COVID-19 cases by Date in US",
  las = 2, cex.axis = 1, cex.names = 0.5
)

barplot(
  cases.total[day.first.case:n.days],
  main = "Total COVID-19 cases by Date in US, log scale",
  log = "y",
  las = 2, cex.axis = 1, cex.names = 0.5
)

# National cumulative deaths per date (sum over all rows).
deaths.total <- colSums(US.deaths.data)

# Plot from the first date with any recorded death.
day.first.case <- min(which(deaths.total > 0))
n.days <- length(deaths.total)

barplot(
  deaths.total[day.first.case:n.days],
  main = "Total COVID-19 deaths by Date in US",
  las = 2, cex.axis = 1, cex.names = 0.5
)

barplot(
  deaths.total[day.first.case:n.days],
  main = "Total COVID-19 deaths by Date in US, log scale",
  log = "y",
  las = 2, cex.axis = 1, cex.names = 0.5
)

Average rise in cases per day

avg.rise.cases

# Day-over-day rise in cumulative cases for every row: each date column minus
# the column before it. The result has one column fewer than US.cases.data and
# each column is labelled with the later date of its pair. Done as a single
# matrix subtraction rather than a column-by-column loop.
rise.cases <- US.cases.data[, -1, drop = FALSE] -
  US.cases.data[, -ncol(US.cases.data), drop = FALSE]
# Drop the row names carried over from US.cases.data so the per-row averages
# stored in US.stats are unnamed.
rownames(rise.cases) <- NULL

# Average daily rise per row.
US.stats$avg.rise.cases <- rowMeans(rise.cases)

# National rise per date (sum over all rows).
rise.cases.total <- colSums(rise.cases)

# Plot from the first date with a positive national rise.
day.first.case <- min(which(rise.cases.total > 0))
n.days <- length(rise.cases.total)

barplot(rise.cases.total[day.first.case:n.days], main = "Rise in Cases of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

Average rise in deaths per day

avg.rise.deaths

# Day-over-day rise in cumulative deaths for every row: each date column minus
# the column before it. The result has one column fewer than US.deaths.data
# and each column is labelled with the later date of its pair. Done as a
# single matrix subtraction rather than a column-by-column loop.
rise.deaths <- US.deaths.data[, -1, drop = FALSE] -
  US.deaths.data[, -ncol(US.deaths.data), drop = FALSE]
# Drop the row names carried over from US.deaths.data so the per-row averages
# stored in US.stats are unnamed.
rownames(rise.deaths) <- NULL

# Average daily rise per row.
US.stats$avg.rise.deaths <- rowMeans(rise.deaths)

# National rise per date (sum over all rows).
rise.deaths.total <- colSums(rise.deaths)

# Plot from the first date with a positive national rise.
day.first.case <- min(which(rise.deaths.total > 0))
n.days <- length(rise.deaths.total)

barplot(rise.deaths.total[day.first.case:n.days], main = "Rise in Deaths of COVID-19 by Date in US", las = 2, cex.axis = 1, cex.names = 0.5)

Total cases

total.cases

# Cumulative cases per row, taken from the last column of US.cases.data
# (ndays.cases is that matrix's column count).
# NOTE(review): presumably the last column is the most recent date — confirm
# the source file keeps its date columns in chronological order.
US.stats$total.cases <- US.cases.data[,ndays.cases]

Total cases per capita

# Cases per person. Rows with a population of zero would divide by zero, so
# they are set to NA instead.
US.stats$total.cases.percap <- replace(
  US.stats$total.cases / US.cases$Population,
  US.cases$Population == 0,
  NA
)
hist(US.stats$total.cases.percap)

Total deaths

total.deaths

# Cumulative deaths per row, taken from the last column of US.deaths.data
# (ndays.deaths is that matrix's column count).
# NOTE(review): presumably the last column is the most recent date — confirm
# the source file keeps its date columns in chronological order.
US.stats$total.deaths <- US.deaths.data[,ndays.deaths]

Total deaths per capita

total.deaths.percap

# Deaths per person. Rows with a population of zero would divide by zero, so
# they are set to NA instead.
US.stats$total.deaths.percap <- replace(
  US.stats$total.deaths / US.deaths$Population,
  US.deaths$Population == 0,
  NA
)

# Largest per-capita death rate among rows with a known population.
max(US.stats$total.deaths.percap, na.rm = TRUE)
## [1] 0.002597528

Total deaths per case

total.deaths.percase

Note: because of errors in the Johns Hopkins data, some rows have total.deaths > total.cases.

# Earlier variant, kept for reference: it stored 0 (rather than NA) for rows
# with no cases.
# pos.case.ind <- US.stats$total.cases > 0
# US.stats$total.deaths.percase[pos.case.ind] <- US.stats$total.deaths[pos.case.ind] / US.stats$total.cases[pos.case.ind]
# US.stats$total.deaths.percase[!pos.case.ind] <- 0

# Deaths per case. Rows with zero cases would divide by zero, so they are set
# to NA instead.
US.stats$total.deaths.percase <- replace(
  US.stats$total.deaths / US.stats$total.cases,
  US.stats$total.cases == 0,
  NA
)

# Show the inconsistent rows: more deaths recorded than cases.
subset(US.stats, total.deaths > total.cases)
##           UID avg.rise.cases avg.rise.deaths total.cases
## 3203 84090002      0.0000000      0.04395604           0
## 3206 84090006      0.0000000      0.02197802           0
## 3222 84090024      0.0000000      1.18681319           0
## 3229 84090031      0.1318681      0.21978022          12
## 3231 84090033      0.1868132      0.49450549          17
## 3250 84090054      0.0000000      0.21978022           0
## 3252 84090056      0.0000000      0.05494505           0
##      total.cases.percap total.deaths total.deaths.percap
## 3203                 NA            4                  NA
## 3206                 NA            2                  NA
## 3222                 NA          108                  NA
## 3229                 NA           20                  NA
## 3231                 NA           45                  NA
## 3250                 NA           20                  NA
## 3252                 NA            5                  NA
##      total.deaths.percase
## 3203                   NA
## 3206                   NA
## 3222                   NA
## 3229             1.666667
## 3231             2.647059
## 3250                   NA
## 3252                   NA

—- Merge COVID data with Neighborhood data —-

# Build the join key: left-pad the UID with zeros to 8 characters and keep
# characters 4 through 8.
# NOTE(review): presumably this yields the 5-digit county FIPS code used as
# `ID` in county_factors — confirm against that table.
US.stats$ID <- substr(
  str_pad(US.stats$UID, 8, "left", pad = "0"),
  4, 8
)

# Keep only the rows whose ID appears in both tables.
data.merge <- merge(x = US.stats, y = county_factors, by = "ID")

—- Plot the relationships —-

# Spearman rank correlations between every pair of columns, using only rows
# with no missing values. The first two columns of the merged table are
# excluded (presumably the ID and UID identifiers — confirm).
data.cor <- cor(
  data.merge[, -(1:2)],
  method = "spearman",
  use = "complete.obs"
)
corrplot.mixed(
  data.cor,
  lower = 'number', upper = 'ellipse',
  tl.pos = 'lt', tl.cex = 1,
  lower.col = "black", number.cex = 0.5
)

# Add the 500 Cities data, keeping only IDs present in both tables.
data.merge2 <- merge(
  x = data.merge, y = county_500CitiesData,
  by = "ID", all.x = FALSE
)

—- Plot the relationships —-

# Same Spearman correlation as above, now on the table that also includes the
# 500 Cities columns; the first two columns are again excluded.
data.cor2 <- cor(
  data.merge2[, -(1:2)],
  method = "spearman",
  use = "complete.obs"
)
corrplot.mixed(
  data.cor2,
  lower = 'number', upper = 'ellipse',
  tl.pos = 'lt', tl.cex = 1,
  lower.col = "black", number.cex = 0.5
)

# Plot only rows 1-7 against columns 8-42 of the correlation matrix.
# NOTE(review): presumably rows 1-7 are the COVID statistics and columns 8-42
# the neighborhood variables — confirm against the column order of data.merge2.
corrplot.mixed(
  data.cor2[1:7, 8:42],
  lower = 'number', upper = 'ellipse',
  tl.pos = 'lt', tl.cex = 1,
  lower.col = "black", number.cex = 0.5
)